{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SELECT from WORLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4041\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@7b280c25"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.functions._\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app02\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@387b229"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mworld\u001b[39m: \u001b[32mDataFrame\u001b[39m = [name: string, continent: string ... 6 more fields]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val world = hiveCxt.table(\"sqlzoo.world\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Introduction\n",
    "\n",
    "[Read the notes about this table](https://sqlzoo.net/wiki/Read_the_notes_about_this_table.). Observe the result of running this SQL command to show the name, continent and population of all countries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>continent</th><th>population</th>\n",
       "                </tr>\n",
       "                <tr><td>Afghanistan</td><td>Asia</td><td>3.222556E7</td></tr><tr><td>Albania</td><td>Europe</td><td>2845955.0</td></tr><tr><td>Algeria</td><td>Africa</td><td>4.3E7</td></tr><tr><td>Andorra</td><td>Europe</td><td>77543.0</td></tr><tr><td>Angola</td><td>Africa</td><td>3.1127674E7</td></tr><tr><td>Antigua and Barbuda</td><td>Caribbean</td><td>96453.0</td></tr><tr><td>Argentina</td><td>South America</td><td>4.4938712E7</td></tr><tr><td>Armenia</td><td>Eurasia</td><td>2957500.0</td></tr><tr><td>Australia</td><td>Oceania</td><td>2.5690023E7</td></tr><tr><td>Austria</td><td>Europe</td><td>8902600.0</td></tr><tr><td>Azerbaijan</td><td>Asia</td><td>1.0067108E7</td></tr><tr><td>Bahamas</td><td>Caribbean</td><td>385340.0</td></tr><tr><td>Bahrain</td><td>Asia</td><td>1543300.0</td></tr><tr><td>Bangladesh</td><td>Asia</td><td>1.68506944E8</td></tr><tr><td>Barbados</td><td>Caribbean</td><td>287025.0</td></tr><tr><td>Belarus</td><td>Europe</td><td>9408400.0</td></tr><tr><td>Belgium</td><td>Europe</td><td>1.1524454E7</td></tr><tr><td>Belize</td><td>North America</td><td>408487.0</td></tr><tr><td>Benin</td><td>Africa</td><td>1.1733059E7</td></tr><tr><td>Bhutan</td><td>Asia</td><td>741672.0</td></tr><tr><td>Bolivia</td><td>South America</td><td>1.1469896E7</td></tr><tr><td>Bosnia and Herzegovina</td><td>Europe</td><td>3301000.0</td></tr><tr><td>Botswana</td><td>Africa</td><td>2338851.0</td></tr><tr><td>Brazil</td><td>South America</td><td>2.11442625E8</td></tr><tr><td>Brunei</td><td>Asia</td><td>442400.0</td></tr><tr><td>Bulgaria</td><td>Europe</td><td>6951482.0</td></tr><tr><td>Burkina Faso</td><td>Africa</td><td>2.087006E7</td></tr><tr><td>Burundi</td><td>Africa</td><td>1.0953317E7</td></tr><tr><td>Cabo Verde</td><td>Africa</td><td>491875.0</td></tr><tr><td>Cambodia</td><td>Asia</td><td>1.5288489E7</td></tr><tr><td>Cameroon</td><td>Africa</td><td>2.6545864E7</td></tr><tr><td>Canada</td><td>North America</td><td>3.8007166E7</td></tr><tr><td>Central African Republic</td><td>Africa</td><td>5496011.0</td></tr><tr><td>Chad</td><td>Africa</td><td>1.6244513E7</td></tr><tr><td>Chile</td><td>South America</td><td>1.9107216E7</td></tr><tr><td>China</td><td>Asia</td><td>1.40237864E9</td></tr><tr><td>Colombia</td><td>South America</td><td>4.9395678E7</td></tr><tr><td>Comoros</td><td>Africa</td><td>873724.0</td></tr><tr><td>Congo, Democratic Republic of</td><td>Africa</td><td>6.936E7</td></tr><tr><td>Congo, Republic of</td><td>Africa</td><td>4559000.0</td></tr><tr><td>Costa Rica</td><td>North America</td><td>5058007.0</td></tr><tr><td>Côte d'Ivoire</td><td>Africa</td><td>2.3919E7</td></tr><tr><td>Croatia</td><td>Europe</td><td>4076246.0</td></tr><tr><td>Cuba</td><td>Caribbean</td><td>1.1209628E7</td></tr><tr><td>Cyprus</td><td>Asia</td><td>875900.0</td></tr><tr><td>Czech Republic</td><td>Europe</td><td>1.0693939E7</td></tr><tr><td>Denmark</td><td>Europe</td><td>5822763.0</td></tr><tr><td>Djibouti</td><td>Africa</td><td>1078373.0</td></tr><tr><td>Dominica</td><td>Caribbean</td><td>71808.0</td></tr><tr><td>Dominican Republic</td><td>Caribbean</td><td>1.035832E7</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "world.select($\"name\", $\"continent\", $\"population\").showHTML()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Large Countries\n",
    "\n",
    "[How to use WHERE to filter records](https://sqlzoo.net/wiki/WHERE_filters). Show the name for the countries that have a population of at least 200 million. 200 million is 200000000, there are eight zeros."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Brazil</td></tr><tr><td>China</td></tr><tr><td>India</td></tr><tr><td>Indonesia</td></tr><tr><td>Nigeria</td></tr><tr><td>Pakistan</td></tr><tr><td>United States</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "world.filter($\"population\">=2e8).select($\"name\").showHTML()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Per capita GDP\n",
    "\n",
    "Give the `name` and the **per capita GDP** for those countries with a `population` of at least 200 million.\n",
    "\n",
    "> _HELP:How to calculate per capita GDP_   \n",
    "> per capita GDP is the GDP divided by the population GDP/population"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>pcgdp</th>\n",
       "                </tr>\n",
       "                <tr><td>Brazil</td><td>9721.37</td></tr><tr><td>China</td><td>8724.31</td></tr><tr><td>India</td><td>1891.78</td></tr><tr><td>Indonesia</td><td>3804.77</td></tr><tr><td>Nigeria</td><td>1822.89</td></tr><tr><td>Pakistan</td><td>1377.04</td></tr><tr><td>United States</td><td>59121.19</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.withColumn(\"pcgdp\", round($\"gdp\"/$\"population\", 2))\n",
    "    .filter($\"population\" >= 2e8)\n",
    "    .select($\"name\", $\"pcgdp\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 4. South America In millions\n",
    "\n",
    "Show the `name` and `population` in millions for the countries of the `continent` 'South America'. Divide the population by 1000000 to get population in millions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>popl</th>\n",
       "                </tr>\n",
       "                <tr><td>Argentina</td><td>44.94</td></tr><tr><td>Bolivia</td><td>11.47</td></tr><tr><td>Brazil</td><td>211.44</td></tr><tr><td>Chile</td><td>19.11</td></tr><tr><td>Colombia</td><td>49.4</td></tr><tr><td>Ecuador</td><td>17.47</td></tr><tr><td>Guyana</td><td>0.78</td></tr><tr><td>Paraguay</td><td>7.25</td></tr><tr><td>Peru</td><td>32.13</td></tr><tr><td>Saint Vincent and the Grenadines</td><td>0.11</td></tr><tr><td>Suriname</td><td>0.58</td></tr><tr><td>Uruguay</td><td>3.52</td></tr><tr><td>Venezuela</td><td>32.22</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.withColumn(\"popl\", round($\"population\"/1e6, 2))\n",
    "    .filter($\"continent\" === \"South America\")\n",
    "    .select($\"name\", $\"popl\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. France, Germany, Italy\n",
    "\n",
    "Show the `name` and `population` for France, Germany, Italy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>population</th>\n",
       "                </tr>\n",
       "                <tr><td>France</td><td>6.7076E7</td></tr><tr><td>Germany</td><td>8.31493E7</td></tr><tr><td>Italy</td><td>6.0238522E7</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[36mlistVal\u001b[39m: \u001b[32mSeq\u001b[39m[\u001b[32mString\u001b[39m] = \u001b[33mList\u001b[39m(\u001b[32m\"France\"\u001b[39m, \u001b[32m\"Germany\"\u001b[39m, \u001b[32m\"Italy\"\u001b[39m)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val listVal = Seq(\"France\", \"Germany\", \"Italy\")\n",
    "(world.filter($\"name\".isin(listVal: _*))\n",
    "     .select($\"name\", $\"population\")\n",
    "     .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6. United\n",
    "\n",
    "Show the countries which have a `name` that includes the word 'United'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>United Arab Emirates</td></tr><tr><td>United Kingdom</td></tr><tr><td>United States</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter($\"name\".contains(\"United\"))\n",
    " .select($\"name\").showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 7. Two ways to be big\n",
    "\n",
    "Two ways to be big: A country is **big** if it has an area of more than 3 million sq km or it has a population of more than 250 million.\n",
    "\n",
    "**Show the countries that are big by area or big by population. Show name, population and area.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>population</th><th>area</th>\n",
       "                </tr>\n",
       "                <tr><td>Australia</td><td>2.5690023E7</td><td>7692024.0</td></tr><tr><td>Brazil</td><td>2.11442625E8</td><td>8515767.0</td></tr><tr><td>Canada</td><td>3.8007166E7</td><td>9984670.0</td></tr><tr><td>China</td><td>1.40237864E9</td><td>9596961.0</td></tr><tr><td>India</td><td>1.361503224E9</td><td>3166414.0</td></tr><tr><td>Indonesia</td><td>2.669119E8</td><td>1904569.0</td></tr><tr><td>Russia</td><td>1.46745098E8</td><td>1.7125242E7</td></tr><tr><td>United States</td><td>3.29583916E8</td><td>9826675.0</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter(($\"area\" > 3e6) || ($\"population\" > 2.5e8))\n",
    "    .select($\"name\", $\"population\", $\"area\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 8. One or the other (but not both)\n",
    "\n",
    "**Exclusive OR (XOR). Show the countries that are big by area (more than 3 million) or big by population (more than 250 million) but not both. Show name, population and area.**\n",
    "\n",
    "- Australia has a big area but a small population, it should be **included**.\n",
    "- Indonesia has a big population but a small area, it should be **included**.\n",
    "- China has a big population **and** big area, it should be **excluded**.\n",
    "- United Kingdom has a small population and a small area, it should be **excluded**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>population</th><th>area</th>\n",
       "                </tr>\n",
       "                <tr><td>Australia</td><td>2.5690023E7</td><td>7692024.0</td></tr><tr><td>Brazil</td><td>2.11442625E8</td><td>8515767.0</td></tr><tr><td>Canada</td><td>3.8007166E7</td><td>9984670.0</td></tr><tr><td>Indonesia</td><td>2.669119E8</td><td>1904569.0</td></tr><tr><td>Russia</td><td>1.46745098E8</td><td>1.7125242E7</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter(($\"area\" > 3e6) !== ($\"population\" > 2.5e8))\n",
    "     .select($\"name\", $\"population\", $\"area\")\n",
    "     .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 9. Rounding\n",
    "\n",
    "Show the `name` and `population` in millions and the GDP in billions for the countries of the `continent` 'South America'. Use the [ROUND](https://sqlzoo.net/wiki/ROUND) function to show the values to two decimal places.\n",
    "\n",
    "**For South America show population in millions and GDP in billions both to 2 decimal places.**\n",
    "\n",
    "> _Millions and billions_    \n",
    "> Divide by 1000000 (6 zeros) for millions. Divide by 1000000000 (9 zeros) for billions."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>popl</th><th>gdp_</th>\n",
       "                </tr>\n",
       "                <tr><td>Argentina</td><td>44.94</td><td>637.49</td></tr><tr><td>Bolivia</td><td>11.47</td><td>37.51</td></tr><tr><td>Brazil</td><td>211.44</td><td>2055.51</td></tr><tr><td>Chile</td><td>19.11</td><td>277.08</td></tr><tr><td>Colombia</td><td>49.4</td><td>309.19</td></tr><tr><td>Ecuador</td><td>17.47</td><td>104.3</td></tr><tr><td>Guyana</td><td>0.78</td><td>3.09</td></tr><tr><td>Paraguay</td><td>7.25</td><td>29.44</td></tr><tr><td>Peru</td><td>32.13</td><td>211.4</td></tr><tr><td>Saint Vincent and the Grenadines</td><td>0.11</td><td>0.73</td></tr><tr><td>Suriname</td><td>0.58</td><td>5.21</td></tr><tr><td>Uruguay</td><td>3.52</td><td>59.18</td></tr><tr><td>Venezuela</td><td>32.22</td><td>255.09</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter($\"continent\" === \"South America\")\n",
    "    .withColumn(\"popl\", round($\"population\"/1e6, 2))\n",
    "    .withColumn(\"gdp_\", round($\"gdp\"/1e9, 2))\n",
    "    .select($\"name\", $\"popl\", $\"gdp_\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 10. Trillion dollar economies\n",
    "\n",
    "Show the `name` and per-capita GDP for those countries with a GDP of at least one trillion (1000000000000; that is 12 zeros). Round this value to the nearest 1000.\n",
    "\n",
    "**Show per-capita GDP for the trillion dollar countries to the nearest $1000.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>pcgdp</th>\n",
       "                </tr>\n",
       "                <tr><td>Australia</td><td>55000.0</td></tr><tr><td>Brazil</td><td>10000.0</td></tr><tr><td>Canada</td><td>43000.0</td></tr><tr><td>China</td><td>9000.0</td></tr><tr><td>France</td><td>39000.0</td></tr><tr><td>Germany</td><td>44000.0</td></tr><tr><td>India</td><td>2000.0</td></tr><tr><td>Indonesia</td><td>4000.0</td></tr><tr><td>Italy</td><td>32000.0</td></tr><tr><td>Japan</td><td>39000.0</td></tr><tr><td>Mexico</td><td>9000.0</td></tr><tr><td>Russia</td><td>10000.0</td></tr><tr><td>South Korea</td><td>22000.0</td></tr><tr><td>Spain</td><td>28000.0</td></tr><tr><td>United Kingdom</td><td>40000.0</td></tr><tr><td>United States</td><td>59000.0</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world\n",
    " .withColumn(\"pcgdp\", round($\"gdp\" / (lit(1000) * $\"population\"), 0) * lit(1000))\n",
    " .filter($\"gdp\" > 1e12)\n",
    " .select($\"name\", $\"pcgdp\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11. Name and capital have the same length\n",
    "\n",
    "Greece has capital Athens.\n",
    "\n",
    "Each of the strings 'Greece', and 'Athens' has 6 characters.\n",
    "\n",
    "**Show the name and capital where the name and the capital have the same number of characters.**\n",
    "\n",
    "- You can use the [LENGTH](https://sqlzoo.net/wiki/LENGTH) function to find the number of characters in a string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>capital</th>\n",
       "                </tr>\n",
       "                <tr><td>Algeria</td><td>Algiers</td></tr><tr><td>Angola</td><td>Luanda</td></tr><tr><td>Armenia</td><td>Yerevan</td></tr><tr><td>Botswana</td><td>Gaborone</td></tr><tr><td>Canada</td><td>Ottowa</td></tr><tr><td>Djibouti</td><td>Djibouti</td></tr><tr><td>Egypt</td><td>Cairo</td></tr><tr><td>Estonia</td><td>Tallinn</td></tr><tr><td>Fiji</td><td>Suva</td></tr><tr><td>Gambia</td><td>Banjul</td></tr><tr><td>Georgia</td><td>Tbilisi</td></tr><tr><td>Ghana</td><td>Accra</td></tr><tr><td>Greece</td><td>Athens</td></tr><tr><td>Luxembourg</td><td>Luxembourg</td></tr><tr><td>Mauritania</td><td>Nouakchott</td></tr><tr><td>Paraguay</td><td>Asunción</td></tr><tr><td>Peru</td><td>Lima</td></tr><tr><td>Poland</td><td>Warsaw</td></tr><tr><td>Russia</td><td>Moscow</td></tr><tr><td>Rwanda</td><td>Kigali</td></tr><tr><td>San Marino</td><td>San Marino</td></tr><tr><td>Singapore</td><td>Singapore</td></tr><tr><td>Taiwan</td><td>Taipei</td></tr><tr><td>Togo</td><td>Lomé</td></tr><tr><td>Turkey</td><td>Ankara</td></tr><tr><td>Zambia</td><td>Lusaka</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter(length($\"name\") === length($\"capital\"))\n",
    "    .select($\"name\", $\"capital\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12. Matching name and capital\n",
    "\n",
    "The capital of Sweden is Stockholm. Both words start with the letter 'S'.\n",
    "\n",
    "**Show the name and the capital where the first letters of each match. Don't include countries where the name and the capital are the same word.**\n",
    "\n",
    "- You can use the function [LEFT](https://sqlzoo.net/wiki/LEFT) to isolate the first character.\n",
    "- You can use <> as the **NOT EQUALS** operator."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th><th>capital</th>\n",
       "                </tr>\n",
       "                <tr><td>Algeria</td><td>Algiers</td></tr><tr><td>Andorra</td><td>Andorra la Vella</td></tr><tr><td>Barbados</td><td>Bridgetown</td></tr><tr><td>Belize</td><td>Belmopan</td></tr><tr><td>Brazil</td><td>Brasília</td></tr><tr><td>Brunei</td><td>Bandar Seri Begawan</td></tr><tr><td>Burundi</td><td>Bujumbura</td></tr><tr><td>Djibouti</td><td>Djibouti</td></tr><tr><td>Guatemala</td><td>Guatemala City</td></tr><tr><td>Guyana</td><td>Georgetown</td></tr><tr><td>Kuwait</td><td>Kuwait City</td></tr><tr><td>Luxembourg</td><td>Luxembourg</td></tr><tr><td>Maldives</td><td>Malé</td></tr><tr><td>Marshall Islands</td><td>Majuro</td></tr><tr><td>Mexico</td><td>Mexico City</td></tr><tr><td>Monaco</td><td>Monaco-Ville</td></tr><tr><td>Mozambique</td><td>Maputo</td></tr><tr><td>Niger</td><td>Niamey</td></tr><tr><td>Panama</td><td>Panama City</td></tr><tr><td>Papua New Guinea</td><td>Port Moresby</td></tr><tr><td>San Marino</td><td>San Marino</td></tr><tr><td>Sao Tomé and Príncipe</td><td>São Tomé</td></tr><tr><td>Singapore</td><td>Singapore</td></tr><tr><td>South Korea</td><td>Seoul</td></tr><tr><td>Sri Lanka</td><td>Sri Jayawardenepura Kotte</td></tr><tr><td>Sweden</td><td>Stockholm</td></tr><tr><td>Taiwan</td><td>Taipei</td></tr><tr><td>Tunisia</td><td>Tunis</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter(substring($\"name\", 1, 1) === substring($\"capital\", 1, 1))\n",
    "    .select($\"name\", $\"capital\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13. All the vowels\n",
    "\n",
    "**Equatorial Guinea** and **Dominican Republic** have all of the vowels (a e i o u) in the name. They don't count because they have more than one word in the name.\n",
    "\n",
    "**Find the country that has all the vowels and no spaces in its name.**\n",
    "\n",
    "- You can use the phrase name `NOT LIKE '%a%'` to exclude characters from your results.\n",
    "- The query shown misses countries like Bahamas and Belarus because they contain at least one 'a'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>name</th>\n",
       "                </tr>\n",
       "                <tr><td>Mozambique</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(world.filter($\"name\".rlike(\"[Aa]\") &&\n",
    "              $\"name\".rlike(\"[Ee]\") &&\n",
    "              $\"name\".rlike(\"[Ii]\") &&\n",
    "              $\"name\".rlike(\"[Oo]\") &&\n",
    "              $\"name\".rlike(\"[Uu]\") &&\n",
    "              $\"name\".rlike(\"^\\\\S+$\"))\n",
    "    .select($\"name\")\n",
    "    .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
